qui {
noisily {
	/*************************************************/
	/******** Step 1.8. Institutions			 *****/
	/*************************************************/

/* 1.8.1. Democracy */
/*
This block contains no commands; it only documents the procedure implemented below.

To obtain panel data on the evolution of democracy, our strategy is the following:
	- Collect data on democracy indexes from various sources
	- Linearly interpolate missing values in all data sources
	- Select a base series, the most complete dataset
	- From the additional sources, select the one for which the overlapping paths are most strongly correlated with the base series.
	- Estimate a third-order polynomial in the value of the alternative index of democracy, year and region dummies to approximate missing data
	- Use the implied growth rates in these estimated trajectories to maximally extend the baseline data forward and backward.
	- Proceed with the second, third, ... most strongly correlated source and fill in as many missing values as possible
	- As data for 2015 and 2016 are missing in all data sources, extend the index forward by assuming a linear trend
	- Finally, rescale the index such that it has a minimal value of 0:
	
		DEM_it = DEM_it - MIN(DEM)
		
	- Result = composite index of democracy
	
*/
}
/* 1.8.1.1. Load data */
* Step one level up and into the folder that holds the intermediary datasets
cd ..
cd ".\3. Intermediary results"

* Attach the democracy index of every source file to the data in memory, one file at a time
local demsources AEJ CLIO_dem GM2 UDS VH FHI P2
foreach src of local demsources {
	merge 1:1 year cntrycode_`src' using `src', keepusing(dem_`src')
	destring dem_`src', force replace
	keep if _merge != 2			// discard rows that exist only in the source file
	drop _merge
	keep if cntry != ""			// discard rows with an empty cntry
	}

* The CLIO file stores its index as dem_CLIO_dem; shorten it to match the other sources
rename dem_CLIO_dem dem_CLIO

/* 1.8.1.2. Linearly interpolate missing values in all datasets */
* For every source this loop
*	- keeps a copy of the series as it was before interpolation (dem_SOURCE_raw),
*	- overwrites dem_SOURCE with the series linearly interpolated over year within cntry,
*	- creates interpolated_dem_SOURCE: 0 = value was present before interpolation,
*	  1 = value filled in by ipolate, . = still missing.
* No rescaling or standardisation of the values takes place here.
sort cntry year

foreach dataset in "CLIO" "VH" "GM2" "UDS" "FHI" "AEJ" "P2"  {
	gen interpolated_dem_`dataset' = 0 if dem_`dataset' != .
	gen dem_`dataset'_raw = dem_`dataset' 								// Keep raw data, to track number of linearly interpolated contributions
	by cntry: ipolate dem_`dataset' year, generate(dem_`dataset'1)
	* Diagnostic only: r(N) is the number of in-sample values added by the interpolation
	* (nothing is displayed while this runs inside the enclosing quietly block)
	sum cntrycode if year >= $startyear & independence_years != . & dem_`dataset' == . & dem_`dataset'1 != .
	replace dem_`dataset' = dem_`dataset'1
	drop dem_`dataset'1
	* Whatever is non-missing now but carried no flag before was filled in by ipolate
	replace interpolated_dem_`dataset' = 1 if interpolated_dem_`dataset' == . & dem_`dataset' != .
	}

/* 1.8.1.3. Declare panel structure, define CLIO data as baseline series (most extensive) */
* Note: CLIO data only available until 2000 (except for New Zealand)
*
* Creates the composite series dem together with its bookkeeping variables. Each of them is
* left missing wherever dem is missing, so that later steps can recognise newly filled
* observations by testing the bookkeeping variable for missing:
*	source_dem					code of the source that supplied the value (1 = CLIO)
*	interpolated_dem			1 if the value stems from linear interpolation, 0 otherwise
*	extrapolated_dem			1 if the value stems from growth-rate/trend extension, 0 otherwise
*	polynomialpredicted_dem		1 if the value stems from a regression prediction, 0 otherwise

	xtset cntrycode year

	gen dem = dem_CLIO if independence_years != . & year >= $startyear
	gen source_dem = 1 if dem != .
	scalar correlation_dem_1 = 1				// the baseline correlates perfectly with itself
	
	* Identify interpolated parts
	* ("if", not "&": the flag has to stay missing where dem is missing, see above)
	gen interpolated_dem = 0 if dem != .
	replace interpolated_dem = 1 if interpolated_dem_CLIO == 1 & dem != .

	* Identify extrapolated parts
	gen extrapolated_dem = 0  if dem != .

	* identify polynomial predicted parts
	gen polynomialpredicted_dem = 0  if dem != .

/* 1.8.1.4. Maximally extend forward and backward by growth rates implied in VH data */
* Note: VH data available until 2012

	* Correlation of the CLIO series with the VH series, displayed and kept (rounded) as a scalar
	noi di "Correlation between CLIO and VH data"
	noisily corr dem_CLIO dem_VH if independence_years != . & year >= $startyear
	scalar correlation_dem_2 = round(`r(rho)',.01)

	xtset cntrycode year

	foreach dataset in "VH" {

		* Chain the growth rate of the source series onto the neighbouring value of dem.
		* First pass: neighbour = previous year (L.), which extends dem forward.
		* Second pass: neighbour = next year (F.), which extends dem backward.
		* Every pass is repeated until the in-sample number of non-missing dem stops changing.
		foreach neighbour in L F {
			local gained = 999
			while `gained' != 0 {
				summarize dem if independence_years != . & year >= $startyear, meanonly
				local before = r(N)
				replace dem = (1+(dem_`dataset'-`neighbour'.dem_`dataset')/`neighbour'.dem_`dataset')*`neighbour'.dem if dem == .
				summarize dem if independence_years != . & year >= $startyear, meanonly
				local after = r(N)
				local gained = `after' - `before'
				}
			}

		* Bookkeeping for the values obtained by growth-rate chaining:
		* source code 992, interpolated if the VH value itself was interpolated, extrapolated = 1
		replace source_dem = 992 if source_dem == . & dem != .
		replace interpolated_dem = 1  if dem != . & interpolated_dem_`dataset' == 1 & interpolated_dem == .
		replace extrapolated_dem = 1  if dem != . & extrapolated_dem == .

		* Where dem is still missing in sample, take the level of the source series itself
		replace dem = dem_`dataset' if dem == .  & independence_years != . & year >= $startyear

		* Bookkeeping for the values copied in levels:
		* source code 2, interpolated if the VH value itself was interpolated, extrapolated = 0
		replace source_dem = 2 if source_dem == . & dem != .
		replace interpolated_dem = 1  if dem != . & interpolated_dem_`dataset' == 1 & interpolated_dem == .
		replace extrapolated_dem = 0 if dem != . & extrapolated_dem == .

		* Nothing filled in so far stems from a regression prediction
		replace polynomialpredicted_dem = 0 if dem != . & polynomialpredicted_dem == .
		}
		
/* 1.8.1.5. Show correlation of the remaining sources with the extended baseline */

	* Display, for each remaining source, its correlation with dem (no sample restriction applied)
	local remaining GM2 UDS FHI AEJ P2
	foreach dataset of local remaining {
		noisily corr dem_`dataset' dem
		}

/* 1.8.1.6. Predict missing country-years using other sources */
/*
For each remaining source the loop below
	- builds the regressors: square and cube of the source index, year dummies, region dummies
	  and their interactions,
	- regresses dem on them and stores the rounded adjusted R-squared in scalar appr_dem_SOURCE_r2,
	- stores the fitted values in appr_dem_SOURCE,
	- stores in scalar predictive_accuracy_SOURCE the share of observations (among those with
	  drop == 0) whose dem lies inside the 99% confidence band of the fitted value,
	- drops all helper variables again; appr_dem_SOURCE itself is kept for the later steps.
The code assumes 7 region categories and year dummies running up to 2016.
*/

noi di "Calculate predictions based on:", _continue
foreach dataset in "GM2" "UDS" "FHI" "AEJ" "P2" {
	noi di "`dataset'", _continue
	
		* Generate indicators:
					
			* Squared and cubic term democracy indicator:
			
			gen dem_`dataset'_sq = dem_`dataset'^2
			gen dem_`dataset'_cu = dem_`dataset'^3
					
			* Year dummies
			* tab numbers the dummies 1, 2, ...; they are renamed to the calendar year, which
			* presumes that every year from $startyear up to 2016 occurs in the data
			
			tab year if year >= $startyear, gen(year_)
						
			local tyear = 2016 - $startyear + 1
			forval i = 1/`tyear' {
				local j = `i' + $startyear - 1
				rename year_`i' year_`j'
				}

			* Region dummies
			
			tab regioncode, gen(region_)

			* Interactions:
									
					* region and year				
				
					forval i = ${startyear}/2016 {
						forval j = 1/7 {
						gen year_`i'Xregion_`j' = year_`i' * region_`j'
						}
					}
								
					* year and democracy proxy
					
					forval i = ${startyear}/2016 {
						gen year_`i'X`dataset' = year_`i' * dem_`dataset'
						gen year_`i'X`dataset'_sq = year_`i' * dem_`dataset' * dem_`dataset'
						}
								
					* region and democracy proxy
					
					forval j = 1/7 {
						gen region_`j'X`dataset' = region_`j' * dem_`dataset'
						gen region_`j'X`dataset'_sq = region_`j' * dem_`dataset' * dem_`dataset'
						}
									
					* year, region and democracy proxy
					
						forval i = ${startyear}/2016 {
							forval j = 1/7 {
								gen year_`i'Xregion_`j'X`dataset' = year_`i' * region_`j' * dem_`dataset'
								}
							}

				* Predictions: 
				* The wildcards year_* and region_* also match the interaction terms generated
				* above, so the first attempt uses the full set of regressors.
				local startyear = $startyear
				cap reg dem dem_`dataset' dem_`dataset'_sq dem_`dataset'_cu year_* region_* if independence_years != . & year >= $startyear
				local rc = _rc
				
				* If the estimation fails (e.g. non-positive definite matrix), move the estimation
				* period up by one year and retry. The retry uses the varlist range
				* year_S-year_2016, i.e. the variables stored between those two dummies.
				* The loop stops at 2016: beyond that year the range would be invalid and the
				* regression could never succeed, so retrying further would loop forever.
				while `rc' != 0 & `startyear' < 2016 {	
					local startyear = `startyear' + 1
					cap reg dem dem_`dataset' dem_`dataset'_sq dem_`dataset'_cu year_`startyear'-year_2016 region_* if independence_years != . & year >= `startyear'
					local rc = _rc
					}
				if `rc' != 0 {
					noi di as error "Regression of dem on `dataset' failed for every start year up to 2016 (return code `rc')"
					exit `rc'
					}

				scalar appr_dem_`dataset'_r2 = round(e(r2_a), .001)
				predict appr_dem_`dataset' if independence_years != . & year >= $startyear
				
				* 99% confidence band around the fitted value (standard error of the prediction)
				predict appr_dem_`dataset'_se if independence_years != . & year >= $startyear, stdp
				local N = e(df_r)
				gen appr_dem_`dataset'_ub = appr_dem_`dataset' + invttail(`N',.005)*appr_dem_`dataset'_se
				gen appr_dem_`dataset'_lb = appr_dem_`dataset' - invttail(`N',.005)*appr_dem_`dataset'_se
				
				* 1 if the observed dem lies inside the band, 0 if it lies outside,
				* missing where dem or the fitted value is missing
				gen predictive_accuracy_`dataset' = .
				replace predictive_accuracy_`dataset' = 1 if dem !=. & appr_dem_`dataset' != . & appr_dem_`dataset'_lb <= dem & dem <= appr_dem_`dataset'_ub
				replace predictive_accuracy_`dataset' = 0 if dem !=. & appr_dem_`dataset' != . & (appr_dem_`dataset'_lb > dem | dem > appr_dem_`dataset'_ub)
				
				* NOTE(review): drop is a variable that is not created in this section - confirm its meaning
				sum predictive_accuracy_`dataset' if drop == 0
				scalar predictive_accuracy_`dataset' = round(r(mean),.0001)
							
				* Drop unnecessary variables (the scalars and appr_dem_`dataset' itself remain):
				
				drop year_* region_* dem_`dataset'_sq dem_`dataset'_cu appr_dem_`dataset'_* predictive_*
				}
noi di ""

/* 1.8.1.7. Report implications */
* Displays, for every remaining source, the correlation of its prediction with dem, the
* predictive accuracy and the adjusted R-squared, followed by summary statistics.

noi di ""
noi di "Correlation of predictions with reference data:"
* The rounded correlations are stored as correlation_dem_3 (GM2) up to correlation_dem_7 (P2)
local source = 3
foreach dataset in "GM2" "UDS" "FHI" "AEJ" "P2" {
	* Same sample as everywhere else in this step: year >= $startyear, so the first year is included
	noi corr dem appr_dem_`dataset' if year >= $startyear & independence_years != .
	* r(rho) is read directly, without a detour through a macro
	scalar correlation_dem_`source' = round(r(rho), .01)
	local source = `source'+1
	}

noi di "Predictive accuracy:"
foreach dataset in "GM2" "UDS" "FHI" "AEJ" "P2" {
	noi scalar list predictive_accuracy_`dataset'
}

noi di "Adjusted R squared of regressions"
foreach dataset in "GM2" "UDS" "FHI" "AEJ" "P2" {
	noi scalar list appr_dem_`dataset'_r2 
	}
			
noi di "Approximated trajectories: summary statistics"
noi sum dem appr_dem_* if independence_years != . & year >= $startyear
sort cntrycode year

/* 1.8.1.8. Extend baseline data */
* For each remaining source, in the order listed:
*	1. chain the growth rate of its predicted series appr_dem_SOURCE onto dem, forward and
*	   backward, until no further in-sample value is gained (source code 993 ... 997),
*	2. fill what is still missing in sample with the predicted level itself (source code 3 ... 7).
noi di "Extend baseline data using:", _continue
local source = 2

foreach dataset in "GM2" "UDS" "FHI" "AEJ" "P2" {

	local source = `source'+1				// code for values copied in levels
	local source2 = `source'+990			// code for values obtained by growth-rate chaining
	noi di "`dataset'", _continue
		
	* Extend forward 
	* (repeat until the in-sample number of non-missing dem no longer changes)
	local i = 999
	while `i' != 0 {
		sum dem if independence_years != . & year >= $startyear
		local original = r(N)
		replace dem = (1+(appr_dem_`dataset'-L.appr_dem_`dataset')/L.appr_dem_`dataset')*L.dem if dem == .
		sum dem if independence_years != . & year >= $startyear
		local extended = r(N)
		local i = `extended' - `original'
		}
	
	* Extend backward
	local i = 999
	while `i' != 0 {
		sum dem if independence_years != . & year >= $startyear
		local original = r(N)
		replace dem = (1+(appr_dem_`dataset'-F.appr_dem_`dataset')/F.appr_dem_`dataset')*F.dem if dem == .
		sum dem if independence_years != . & year >= $startyear
		local extended = r(N)
		local i = `extended' - `original'
		}
		
	* Identify source
	replace source_dem = `source2' if source_dem == . & dem != .

	* Identify extrapolated parts: values obtained by growth-rate chaining are extrapolations.
	* This has to happen before the level fill below, which would otherwise flag them as 0.
	replace extrapolated_dem = 1 if dem != . & extrapolated_dem == .

	* Extend for countries missing in reference data series
	replace dem = appr_dem_`dataset' if dem == . & independence_years != . & year >= $startyear

	* Identify source
	replace source_dem = `source' if source_dem == . & dem != .
	
	* Identify interpolated parts
	replace interpolated_dem = 0  if dem != . & interpolated_dem == .
					
	* Identify extrapolated parts (only the values copied in levels are still unflagged here)
	replace extrapolated_dem = 0 if dem != . & extrapolated_dem == .
					
	* Identify polynomial predicted parts
	replace polynomialpredicted_dem = 1 if dem != . & polynomialpredicted_dem == .
	}

noi di ""

/* 1.8.1.9. Linearly interpolate missings between non-overlapping parts, predict 2015 & 2016 values by assuming a linear trend and rescale the index */
* Note, no values available for 2015 or 2016 in any dataset

		* Interpolate missing values
		sort cntry year
		by cntry: ipolate dem year, generate(dem1)
		replace dem = dem1
		drop dem1
		sort cntry year
		replace interpolated_dem = 1 if interpolated_dem == . & dem != .
		* The values just filled in are interpolations, not extrapolations. Flag them now,
		* otherwise the trend step below would mark them as extrapolated.
		replace extrapolated_dem = 0 if extrapolated_dem == . & dem != .
		replace source_dem = 0 if source_dem == . & dem != .

		* Extend to 2015 & 2016 by assuming a linear trend
		* (country-specific intercept and slope, estimated on all non-missing dem)
		gen lineartrend = year- $startyear + 1
		reg dem i.cntrycode##c.lineartrend
		predict pdem if year >= 2015
		replace dem = pdem if year >= 2015 & dem == .
		drop pdem lineartrend
		replace extrapolated_dem = 1 if extrapolated_dem == . & dem != .
		replace source_dem = 0 if source_dem == . & dem != .

		* Rescale composite index such that it has a minimal value of 0
		* (r(min) is used directly so that the minimum is subtracted at full precision)
		sum dem
		replace dem = dem - r(min)
		sum dem
			
/* 1.8.1.10. Report composition of data */

	* Summary statistics of the finished composite index, restricted to the sample
	noisily display "Baseline data: summary statistics"
	noisily summarize dem if independence_years != . & year >= $startyear

/* 1.8.1.11. Label variables */

label variable dem "Vanhanen index of democracy (Composite index)"

/* 1.8.1.12. Drop original data */
* Removes every variable whose name starts with dem_ or appr_; dem itself is kept

drop dem_* appr_*

* Change back to the folder containing the do-files
cd ..
cd ".\1. Dofiles"
}
